########## GLOBALS ##########
rm(list = ls())

# Load packages
library(igraph)
library(dplyr)
library(estimatr)
library(ggplot2)
library(ggfortify)

# Load data files
edgelist <- read.csv("data/edgelist.csv")
county_incomes <- read.csv("data/saipe_incomes.csv")

# Restrict to desired cols
edgelist <- edgelist[, c("start_id", "end_id", "state_county")]

########## GRAPH SETUP ##########
# Convert to graph
transit_g <- graph_from_data_frame(edgelist, directed = TRUE, vertices = NULL)

# Prepare state_county for unique IDs
vertex_data <- data.frame(id = c(edgelist$start_id, edgelist$end_id),
                          state_county = c(edgelist$state_county[match(edgelist$start_id, edgelist$start_id)],
                                           edgelist$state_county[match(edgelist$end_id, edgelist$end_id)]))

# De-duplicate state_county per unique ID
vertex_data <- vertex_data[!duplicated(vertex_data$id), ]

# Assign state_county as a vertex attribute
V(transit_g)$state_county <- vertex_data$state_county[match(V(transit_g)$name, vertex_data$id)]

# Remove state_county attribute from edges and clean workspace
transit_g <- delete_edge_attr(transit_g, "state_county")
rm(vertex_data)

# Add centrality measures
V(transit_g)$degree <- degree(transit_g)
V(transit_g)$closeness <- closeness(transit_g)

########## REFORMAT DATA ##########
# Create a data frame from vertex attributes
vertex_df <- data.frame(
  state_county = V(transit_g)$state_county, # State and county attribute
  degree = V(transit_g)$degree, # Degree centrality
  closeness = V(transit_g)$closeness # Closeness centrality
)

# Aggregate to the county-level
county_agg <- vertex_df %>%
  group_by(state_county) %>%
  summarise(
    num_nodes = n(),
    avg_degree = mean(degree, na.rm = TRUE),
    avg_closeness = mean(closeness, na.rm = TRUE),
    wavg_degree = sum(degree, na.rm = TRUE), # cancels out `num_nodes`
    wavg_closeness = sum(closeness, na.rm = TRUE) # cancels out `num_nodes`
  )
county_agg <- as.data.frame(county_agg)

# Add state column
county_agg$state <- substr(county_agg$state_county, 1, 2)
county_agg$state <- gsub("_", "", county_agg$state)

# Add median income
county_agg <- left_join(county_agg,
                        county_incomes,
                        by = "state_county")

# Subset to relevant cols
county_agg <- county_agg[, c("state",
                             "Name",
                             "state_county",
                             "Median.Household.Income",
                             "num_nodes",
                             "avg_degree",
                             "avg_closeness",
                             "wavg_degree",
                             "wavg_closeness")]

# Rename cols
colnames(county_agg) <- c("state_fips",
                          "county_name",
                          "state_county",
                          "median_income",
                          "num_nodes",
                          "avg_degree",
                          "avg_closeness",
                          "wavg_degree",
                          "wavg_closeness")

# Coerce median income to numeric
county_agg$median_income <- gsub(",", "", county_agg$median_income)
county_agg$median_income <- as.numeric(county_agg$median_income)

# Clean workspace
rm(county_incomes)

########## TRANSFORMATIONS ##########
# Use the weighted DVs (sum instead of mean)
# Log all variables
county_agg$log.weighted_avg_degree <- log(county_agg$wavg_degree)
county_agg$log.weighted_avg_closeness <- log(county_agg$wavg_closeness + 1) # add 1 to avoid -Inf
county_agg$log.median_income <- log(county_agg$median_income)

########## REGRESSIONS ##########
# OLS Models with state-fixed effects but WITHOUT SE clusters and FPC (for diagnostic plots)
degree_income_model <- lm(log.weighted_avg_degree ~
                            log.median_income +
                            factor(state_fips),
                          data = county_agg)
closeness_income_model <- lm(log.weighted_avg_closeness ~
                               log.median_income +
                               factor(state_fips),
                             data = county_agg)

# Calculate FPCs
N <- 3143 # Total num. US counties
n_deg <- nobs(degree_income_model) # sample size degree
n_clo <- nobs(closeness_income_model) # sample size closeness
fpc_deg <- sqrt((N - n_deg) / (N - 1)) # FPC for degree
fpc_clo <- sqrt((N - n_clo) / (N - 1))# FPC for closeness

# OLS Models with state-fixed effects and SE clusters and FPC
degree_income_model.fpc <- lm_robust(log.weighted_avg_degree ~
                                       log.median_income +
                                       factor(state_fips),
                                     data = county_agg,
                                     clusters = county_agg$state_fips,
                                     weights = rep(fpc_deg, nrow(county_agg)))
closeness_income_model.fpc <- lm_robust(log.weighted_avg_closeness ~
                                          log.median_income +
                                          factor(state_fips),
                                        data = county_agg,
                                        clusters = county_agg$state_fips,
                                        weights = rep(fpc_clo, nrow(county_agg)))

# Calculate Residual SE for FPC models
degree_income_model.fpc$res_se <- sqrt(degree_income_model.fpc$res_var)
closeness_income_model.fpc$res_se <- sqrt(closeness_income_model.fpc$res_var)

########## DIAGNOSTIC PLOTS ##########
# Degree centrality model
pdf("output/degree_diagnostics.pdf", 6, 6)
autoplot(degree_income_model, 
         which = c(1:3, 5),  # Selects Residuals vs Fitted, QQ plot, Scale-Location, and Residuals vs Leverage
         ncol = 2, 
         label.size = 0,
         alpha = 0.15,
         colour = "blue",
         smooth.colour = "red",
         shape = 1) + 
  theme_bw(base_size = 15) +
  theme(
    plot.title = element_text(face = "bold", size = rel(0.6), hjust = 0.5),
    axis.title = element_text(size = rel(0.5)),
    axis.text = element_text(size = rel(0.5)),
    panel.grid = element_blank()
  )
dev.off()

# Closeness centrality model
pdf("output/closeness_diagnostics.pdf", 6, 6)
autoplot(closeness_income_model, 
         which = c(1:3, 5),  # Selects Residuals vs Fitted, QQ plot, Scale-Location, and Residuals vs Leverage
         ncol = 2, 
         label.size = 0,
         alpha = 0.15,
         colour = "blue",
         smooth.colour = "red",
         shape = 1) + 
  theme_bw(base_size = 15) +
  theme(
    plot.title = element_text(face = "bold", size = rel(0.6), hjust = 0.5),
    axis.title = element_text(size = rel(0.5)),
    axis.text = element_text(size = rel(0.5)),
    panel.grid = element_blank()
  )
dev.off()